From 126d4342c64edb68b9a1dfb9cb1ae0db20c63b67 Mon Sep 17 00:00:00 2001 From: Keir Fraser Date: Thu, 8 May 2008 14:33:31 +0100 Subject: [PATCH] xend: Fix and improve error handling for failed suspend/migrate This has been broken since cset 16964:5d84464dc1fc Also deal better with very early errors (close sender side socket) Signed-off-by: Steven Hand --- tools/python/xen/xend/XendCheckpoint.py | 14 ++------------ tools/python/xen/xend/XendDomain.py | 6 ++++-- tools/python/xen/xend/XendDomainInfo.py | 13 ++++++++++++- 3 files changed, 18 insertions(+), 15 deletions(-) diff --git a/tools/python/xen/xend/XendCheckpoint.py b/tools/python/xen/xend/XendCheckpoint.py index 6cb9a4a7d6..40f2c08927 100644 --- a/tools/python/xen/xend/XendCheckpoint.py +++ b/tools/python/xen/xend/XendCheckpoint.py @@ -81,8 +81,6 @@ def save(fd, dominfo, network, live, dst, checkpoint=False, node=-1): # thing is useful for debugging. dominfo.setName('migrating-' + domain_name) - done_suspend = 0 - try: dominfo.migrateDevices(network, dst, DEV_MIGRATE_STEP1, domain_name) @@ -110,7 +108,6 @@ def save(fd, dominfo, network, live, dst, checkpoint=False, node=-1): log.debug("Suspending %d ...", dominfo.getDomid()) dominfo.shutdown('suspend') dominfo.waitForShutdown() - done_suspend = 1 dominfo.migrateDevices(network, dst, DEV_MIGRATE_STEP2, domain_name) log.info("Domain %d suspended.", dominfo.getDomid()) @@ -154,16 +151,9 @@ def save(fd, dominfo, network, live, dst, checkpoint=False, node=-1): pass except Exception, exn: - log.exception("Save failed on domain %s (%s).", domain_name, + log.exception("Save failed on domain %s (%s) - resuming.", domain_name, dominfo.getDomid()) - - # If we didn't get as far as suspending the domain (for - # example, we couldn't balloon enough memory for the new - # domain), then we don't want to re-plumb the devices, as the - # domU will not be expecting it. 
- if done_suspend: - log.debug("XendCheckpoint.save: resumeDomain") - dominfo.resumeDomain() + dominfo.resumeDomain() try: dominfo.setName(domain_name) diff --git a/tools/python/xen/xend/XendDomain.py b/tools/python/xen/xend/XendDomain.py index 39e49d13ac..16c93919ac 100644 --- a/tools/python/xen/xend/XendDomain.py +++ b/tools/python/xen/xend/XendDomain.py @@ -1308,8 +1308,10 @@ class XendDomain: sock.send("receive\n") sock.recv(80) - XendCheckpoint.save(sock.fileno(), dominfo, True, live, dst, node=node) - sock.close() + try: + XendCheckpoint.save(sock.fileno(), dominfo, True, live, dst, node=node) + finally: + sock.close() def domain_save(self, domid, dst, checkpoint=False): """Start saving a domain to file. diff --git a/tools/python/xen/xend/XendDomainInfo.py b/tools/python/xen/xend/XendDomainInfo.py index 607a97aef5..7ebb0bf3f3 100644 --- a/tools/python/xen/xend/XendDomainInfo.py +++ b/tools/python/xen/xend/XendDomainInfo.py @@ -2378,8 +2378,19 @@ class XendDomainInfo: def resumeDomain(self): log.debug("XendDomainInfo.resumeDomain(%s)", str(self.domid)) - if self.domid is None: + # resume a suspended domain (e.g. after live checkpoint, or after + # a later error during save or migrate); checks that the domain + # is currently suspended first so safe to call from anywhere + + xeninfo = dom_get(self.domid) + if xeninfo is None: + return + if not xeninfo['shutdown']: return + reason = shutdown_reason(xeninfo['shutdown_reason']) + if reason != 'suspend': + return + try: # could also fetch a parsed note from xenstore fast = self.info.get_notes().get('SUSPEND_CANCEL') and 1 or 0 -- 2.30.2